# Set seed for reproducibility
set.seed(123)
# Register Stadia Maps API key.
# Fix: never commit a live API key to source control. Read it from the
# STADIA_API_KEY environment variable instead (set it in ~/.Renviron).
api_key <- Sys.getenv("STADIA_API_KEY")
if (!nzchar(api_key)) {
  stop("STADIA_API_KEY environment variable is not set.", call. = FALSE)
}
register_stadiamaps(key = api_key)
# Get the path of the working directory in the RProject
proj.path <- getwd()
# Read the raw hotspots data from the data folder in the RProject directory
hotspots_raw <- read_csv(file.path(proj.path, "data", "hotspots.csv"))
## Rows: 1781266 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): source, sensor, satellite, agency, fuel
## dbl (33): lat, lon, uid, temp, rh, ws, wd, pcp, ffmc, dmc, dc, isi, bui, fw...
## dttm (1): rep_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Parse rep_date as POSIXct and derive calendar fields on the raw data
hotspots_raw <- hotspots_raw %>%
  mutate(
    rep_date = as.POSIXct(rep_date, format = "%Y-%m-%d %H:%M:%S"),
    year = year(rep_date),
    month = month(rep_date, label = TRUE, abbr = TRUE)
  )
# Restrict the analysis window to 2014-2023 (inclusive)
hotspots <- hotspots_raw %>%
  filter(year >= 2014, year <= 2023)
# Summarize the number of fire events per year and month.
# This helps us understand the distribution of fire events over time.
fire_events_per_month <- hotspots %>%
  group_by(year, month) %>%
  summarise(n_events = n(), .groups = "drop")
# Reshape to wide format: one row per year, one column per month.
# Fix: fire_events_wide was printed below but never created.
fire_events_wide <- fire_events_per_month %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0L)
# Print the wide format summary table to inspect the reshaped data
print(fire_events_wide)
## # A tibble: 10 × 13
## year Apr May Jun Jul Aug Sep Oct Nov Dec Mar Jan
## <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2014 61 215 114 10474 20910 1996 2443 3952 78 0 0
## 2 2015 221 7840 4920 18147 4690 921 8061 3134 0 281 0
## 3 2016 1667 6453 137 191 1321 1492 1500 1814 269 101 0
## 4 2017 88 435 320 79606 152262 23580 13943 835 0 87 0
## 5 2018 244 1992 1477 5790 336899 9402 9795 2350 0 40 46
## 6 2019 310 1838 246 256 3754 3494 18946 4067 0 176 0
## 7 2020 51 209 40 112 3644 2429 5073 818 252 93 0
## 8 2021 719 283 2739 157537 64954 896 6423 3883 480 357 706
## 9 2022 96 50 144 2513 5967 24132 13781 4718 358 238 261
## 10 2023 528 43256 89281 150531 161000 237760 5999 6019 1197 192 439
## # ℹ 1 more variable: Feb <int>
# Keep only the peak fire season (May to October)
hotspots_peak <- hotspots %>%
  filter(month(rep_date) %in% 5:10)
# Build the wide summary for the peak season.
# Fix: fire_events_subset_wide was printed below but never created.
fire_events_subset_wide <- hotspots_peak %>%
  count(year, month, name = "n_events") %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0L)
# Print the wide format summary table for the peak season
print(fire_events_subset_wide)
## # A tibble: 10 × 7
## year May Jun Jul Aug Sep Oct
## <dbl> <int> <int> <int> <int> <int> <int>
## 1 2014 215 114 10474 20910 1996 2443
## 2 2015 7840 4920 18147 4690 921 8061
## 3 2016 6453 137 191 1321 1492 1500
## 4 2017 435 320 79606 152262 23580 13943
## 5 2018 1992 1477 5790 336899 9402 9795
## 6 2019 1838 246 256 3754 3494 18946
## 7 2020 209 40 112 3644 2429 5073
## 8 2021 283 2739 157537 64954 896 6423
## 9 2022 50 144 2513 5967 24132 13781
## 10 2023 43256 89281 150531 161000 237760 5999
Prepare the data for clustering (latitude, longitude, date).
Convert latitude and longitude from degrees to kilometres (1 degree is
roughly 111 km).
Calculate time in hours from the first recorded event.
Note: time_hours is included as a clustering dimension so that events
occurring within a certain time frame (about 20 hours) are grouped
together.
# Build the clustering feature matrix: coordinates in kilometres plus
# elapsed time in hours since the earliest event.
# NOTE(review): 1 degree of longitude is ~111 * cos(latitude) km, so the
# flat 111 km/degree conversion overstates east-west distances at BC
# latitudes — confirm this is acceptable for the tuned eps value.
event_data <- hotspots_peak %>%
  transmute(
    lat_km = lat * 111,
    lon_km = lon * 111,
    time_hours = as.numeric(difftime(rep_date, min(rep_date), units = "hours"))
  )
# Run DBSCAN on `data` and report the number of clusters and noise points.
#
# data:         numeric feature matrix/data frame to cluster
# eps_value:    DBSCAN neighbourhood radius
# minPts_value: minimum points required to form a dense region
# Returns the number of clusters found (noise excluded).
apply_dbscan <- function(data, eps_value, minPts_value) {
  db <- dbscan(data, eps = eps_value, minPts = minPts_value)
  labels <- db$cluster
  # Count clusters as distinct non-zero labels.
  # Fix: length(unique(labels)) - 1 undercounts by one whenever no point
  # is labelled noise (e.g. minPts = 1 produces no label 0).
  num_clusters <- length(setdiff(unique(labels), 0L))
  num_noise <- sum(labels == 0L)
  cat("eps =", eps_value, ", minPts =", minPts_value,
      ": Number of clusters =", num_clusters,
      ", Number of noise points =", num_noise, "\n")
  return(num_clusters)
}
# Grid-search eps and minPts for the cluster count closest to the
# official number of recorded fires.
desired_clusters <- 15095  # Official number of fires
best_eps <- NA             # Best eps found so far
best_minPts <- NA          # Best minPts found so far
best_diff <- Inf           # Smallest gap to the target so far
for (eps_val in 5:15) {          # Adjust range as needed
  for (minPts_val in 1:5) {      # Adjust range as needed
    n_found <- apply_dbscan(event_data, eps_val, minPts_val)
    gap <- abs(n_found - desired_clusters)
    if (gap < best_diff) {
      # New best match: remember these parameters
      best_eps <- eps_val
      best_minPts <- minPts_val
      best_diff <- gap
    }
  }
}
# Report the best parameters found by the grid search
cat("Best eps value:", best_eps, "\n")        # Best eps value: 11
cat("Best minPts value:", best_minPts, "\n")  # Best minPts value: 2
The best parameters found mean that fire events within an 11 km radius
of each other are grouped into clusters, and a minimum of 2 events is
required to form a cluster.
# Re-run DBSCAN with the best parameters found by the search
best_eps <- 11
best_minPts <- 2
best_db <- dbscan(event_data, eps = best_eps, minPts = best_minPts)
# Attach the cluster labels to the peak-season data
hotspots_peak$event_cluster <- best_db$cluster
# Drop noise points: cluster 0 marks outliers that belong to no cluster
hotspots_peak <- filter(hotspots_peak, event_cluster != 0)
# Report how many clusters DBSCAN identified
cat("Number of clusters identified by DBSCAN:",
    n_distinct(hotspots_peak$event_cluster), "\n")
## Number of clusters identified by DBSCAN: 15026
# Summary of clusters: one row per event_cluster with its active time
# window, event count, and mean location. Used later to match clusters
# against known named fires.
cluster_summary <- hotspots_peak %>%
group_by(event_cluster) %>%
summarise(
start_date = min(rep_date), # Get the earliest date in the cluster
end_date = max(rep_date), # Get the latest date in the cluster
num_events = n(), # Count the number of events in the cluster
latitude = mean(lat), # Calculate the average latitude of events in the cluster
longitude = mean(lon) # Calculate the average longitude of events in the cluster
)
# The five largest clusters by number of events
top_clusters <- cluster_summary %>%
  slice_max(num_events, n = 5, with_ties = FALSE)
# Print the top clusters
print(top_clusters)
## # A tibble: 5 × 6
## event_cluster start_date end_date num_events latitude
## <int> <dttm> <dttm> <int> <dbl>
## 1 3930 2018-08-04 16:56:00 2018-08-22 20:06:00 62103 53.0
## 2 2408 2017-07-30 04:31:00 2017-08-13 02:04:00 61622 52.7
## 3 12440 2023-09-14 03:57:00 2023-09-24 04:08:00 55183 58.7
## 4 3928 2018-08-05 16:38:00 2018-08-22 20:06:00 30660 54.2
## 5 2388 2017-07-23 02:47:00 2017-08-13 05:00:00 30152 51.1
## # ℹ 1 more variable: longitude <dbl>
# Plot the given clusters on a Stadia/Stamen terrain basemap.
#
# cluster_ids: vector of event_cluster ids to draw
# data:        data frame with lon, lat, and event_cluster columns
# zoom_level:  basemap zoom level (default 10)
# Prints the ggmap plot as a side effect.
plot_clusters_on_map <- function(cluster_ids, data, zoom_level = 10) {
  # Filter data for the requested clusters
  cluster_data <- data %>% filter(event_cluster %in% cluster_ids)
  if (nrow(cluster_data) == 0) {
    stop("No events found for the supplied cluster ids.", call. = FALSE)
  }
  # Bounding box with a 0.3-degree margin around the clusters
  bbox <- c(left = min(cluster_data$lon) - 0.3,
            bottom = min(cluster_data$lat) - 0.3,
            right = max(cluster_data$lon) + 0.3,
            top = max(cluster_data$lat) + 0.3)
  # Fetch the basemap from the Stadia Maps API
  map <- get_stadiamap(bbox = bbox, zoom = zoom_level, maptype = "stamen_terrain")
  # Custom fire-themed palette, recycled to cover all clusters.
  # Fix: the original passed exactly 7 colours, so scale_color_manual()
  # errored whenever more than 7 clusters were plotted.
  palette <- c("firebrick1", "orange", "yellow", "orange3",
               "yellow2", "red3", "red4")
  colours <- rep_len(palette, n_distinct(cluster_data$event_cluster))
  # Plot the cluster events on the map
  print(ggmap(map) +
    geom_point(data = cluster_data,
               aes(x = lon, y = lat, color = factor(event_cluster)),
               size = 2, alpha = 0.3, shape = 16) +
    labs(title = "Matched Fire Clusters",
         x = "Longitude",
         y = "Latitude",
         color = "Cluster ID") +
    theme_minimal() +
    theme(legend.position = "bottom") +
    scale_color_manual(values = colours)
  )
}
ID 3930 Tweedsmuir Complex Fire (2018)
ID 2408 Elephant Hill Fire (2017)
ID 12440 Fort Nelson Fire (2023)
ID 3928 Shovel Lake Fire (2018)
ID 2388 Hanceville-Riske Creek Fire (2017)
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
The plots show large areas affected by fires.
Each fire cluster covers a big region, indicating major fire
events.
Fire events within clusters are closely packed, showing high fire
activity.
The fires follow the terrain, influenced by vegetation and
topography.
# Reference record for the known 2023 fire near Kelowna
kelowna_fire <- data.frame(
  fire_name = "McDougall Creek Fire",
  start_date = as.POSIXct("2023-08-16"),
  end_date = as.POSIXct("2023-08-30"),
  latitude = 49.8821,
  longitude = -119.4370
)
# Great-circle distance (km) from each cluster centroid to the known fire
cluster_summary <- cluster_summary %>%
  mutate(
    distance_to_kelowna_fire = distHaversine(
      cbind(longitude, latitude),
      c(kelowna_fire$longitude, kelowna_fire$latitude)
    ) / 1000  # metres -> kilometres
  )
# Clusters within 50 km whose active period overlaps the known fire
kelowna_clusters <- cluster_summary %>%
  filter(
    distance_to_kelowna_fire < 50,        # within 50 km
    start_date <= kelowna_fire$end_date,  # starts before the fire ends
    end_date >= kelowna_fire$start_date   # ends after the fire starts
  )
# Print the matching clusters
print(kelowna_clusters)
## # A tibble: 7 × 7
## event_cluster start_date end_date num_events latitude
## <int> <dttm> <dttm> <int> <dbl>
## 1 11964 2023-08-25 12:44:00 2023-08-29 17:49:00 982 50.0
## 2 11969 2023-08-16 16:54:00 2023-08-20 05:05:00 2770 49.9
## 3 13047 2023-08-20 16:28:00 2023-08-21 04:46:00 446 50.0
## 4 13124 2023-08-21 16:09:00 2023-08-22 04:01:00 252 49.9
## 5 13384 2023-08-23 01:51:00 2023-08-23 04:59:00 15 49.9
## 6 13402 2023-08-23 03:19:00 2023-08-23 04:08:00 4 50.0
## 7 13694 2023-08-25 01:36:00 2023-08-25 04:29:00 8 49.9
## # ℹ 2 more variables: longitude <dbl>, distance_to_kelowna_fire <dbl>
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
The dataset contains different variables
that are important to analyze fire events in British Columbia.
These variables can be broadly categorized into the following
categories:
Location, Time, Data Collection Methods, Weather Data, Fire Indices,
Other.
lat: Latitude of the fire event.
lon: Longitude of the fire event.
# Observed latitude/longitude ranges in the filtered data
lat_range <- range(hotspots$lat, na.rm = TRUE)
lon_range <- range(hotspots$lon, na.rm = TRUE)
# British Columbia reference bounds:
#   Latitude:  between 48.309789 N and 60.00065 N
#   Longitude: between -139.058200 W and -114.05423 W
cat("Latitude Range:", lat_range, "\n")
## Latitude Range: 48.32883 60.0009
cat("Longitude Range:", lon_range, "\n")
## Longitude Range: -138.328 -114.09
rep_date: The date and time when the fire event was
reported.
month and year are new variables added
# Per-year reporting window: first/last report date and span in days
hotspots_df_summary <- hotspots %>%
  group_by(year) %>%
  summarise(
    start_date = min(rep_date),
    end_date = max(rep_date),
    # summarise() lets us reuse the columns computed just above
    hotspots_df_day_count =
      as.numeric(difftime(end_date, start_date, units = "days"))
  )
print(hotspots_df_summary)
## # A tibble: 10 × 4
## year start_date end_date hotspots_df_day_count
## <dbl> <dttm> <dttm> <dbl>
## 1 2014 2014-04-04 18:50:00 2014-12-07 20:37:00 247.
## 2 2015 2015-03-07 06:05:00 2015-11-22 21:23:00 261.
## 3 2016 2016-03-25 04:10:00 2016-12-21 04:55:00 271.
## 4 2017 2017-03-03 04:15:00 2017-11-15 11:41:00 257.
## 5 2018 2018-01-06 03:11:00 2018-11-07 18:19:00 306.
## 6 2019 2019-03-13 02:06:00 2019-11-19 18:54:00 252.
## 7 2020 2020-03-03 03:45:00 2020-12-29 05:05:00 301.
## 8 2021 2021-01-06 13:35:00 2021-12-11 18:36:00 339.
## 9 2022 2022-01-11 13:22:00 2022-12-31 19:06:00 354.
## 10 2023 2023-01-01 05:01:00 2023-12-31 19:14:00 365.
This barplot shows how many days each year had recorded fire events.
It also shows that the reporting period has grown over the years.
# Per-year summary for the peak season (May to October): reporting
# window, span in days, and number of DBSCAN clusters
hotspots_peak_df_summary <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    start_date = min(rep_date),
    end_date = max(rep_date),
    hotspots_peak_day_count =
      as.numeric(difftime(end_date, start_date, units = "days")),
    num_clusters = n_distinct(event_cluster)  # clusters per peak season
  )
print(hotspots_peak_df_summary)
## # A tibble: 10 × 5
## year start_date end_date hotspots_peak_day_count
## <dbl> <dttm> <dttm> <dbl>
## 1 2014 2014-05-01 18:35:00 2014-10-31 23:44:00 183.
## 2 2015 2015-05-03 05:17:00 2015-10-31 22:23:00 182.
## 3 2016 2016-05-01 04:44:00 2016-10-31 06:30:00 183.
## 4 2017 2017-05-02 19:49:00 2017-10-31 21:27:00 182.
## 5 2018 2018-05-01 16:41:00 2018-10-31 04:40:00 182.
## 6 2019 2019-05-01 03:47:00 2019-10-31 18:48:00 184.
## 7 2020 2020-05-03 03:48:00 2020-10-31 16:07:00 182.
## 8 2021 2021-05-05 02:24:00 2021-10-31 18:47:00 180.
## 9 2022 2022-05-04 01:49:00 2022-10-31 18:02:00 181.
## 10 2023 2023-05-01 01:09:00 2023-10-31 18:57:00 184.
## # ℹ 1 more variable: num_clusters <int>
The table shows the earliest and latest dates of fire event
reporting,
the total number of days with recorded fire events,
and the number of unique fire clusters detected during the peak season
for each year.
The barplot shows the number of unique fire clusters
detected each year during the peak fire season.
This number fluctuates each year,
with noticeable peaks in 2018, 2021, and 2023.
Different sources, satellites, and sensors are used for data collection.
# Summary and Visualization of Sources
source_summary <- hotspots %>%
group_by(source) %>%
summarise(num_events = n())
print(source_summary)
## # A tibble: 15 × 2
## source num_events
## <chr> <int>
## 1 HMS 4158
## 2 NASA 232234
## 3 NASA1 5013
## 4 NASA2 339423
## 5 NASA3 330797
## 6 NASA6 5091
## 7 NASA7 5410
## 8 NASA_can 12899
## 9 NASA_usa 435
## 10 NASAkmz 89
## 11 NASAwcan 49947
## 12 NASAwusa 143
## 13 NOAA 172479
## 14 UMD 28514
## 15 USFS 589933
# Number of events recorded by each satellite
satellite_summary <- hotspots %>%
  count(satellite, name = "num_events")
print(satellite_summary)
## # A tibble: 15 × 2
## satellite num_events
## <chr> <int>
## 1 Aqua 80449
## 2 JPSS1 7152
## 3 L8 1297
## 4 Landsat-8 16
## 5 METOP-A 49806
## 6 METOP-B 40999
## 7 N 767
## 8 NOAA-15 9296
## 9 NOAA-18 28695
## 10 NOAA-19 35724
## 11 NOAA-20 432901
## 12 NPP 1201
## 13 S-NPP 927884
## 14 Terra 110268
## 15 <NA> 50110
# Number of events recorded by each sensor
sensor_summary <- hotspots %>%
  count(sensor, name = "num_events")
# Advanced Very High Resolution Radiometer (AVHRR) imagery, courtesy of NOAA
# Moderate Resolution Imaging Spectroradiometer (MODIS) imagery, courtesy of NASA
# Visible Infrared Imaging Radiometer Suite (VIIRS) imagery, courtesy of NASA LANCE FIRMS
print(sensor_summary)
## # A tibble: 8 × 2
## sensor num_events
## <chr> <int>
## 1 AVHRR 164520
## 2 IBAND 400710
## 3 Landsat 1014
## 4 MODIS 240827
## 5 OLI 299
## 6 VIIRS 94345
## 7 VIIRS-I 869363
## 8 VIIRS-M 5487
kelowna_cluster_ids <- kelowna_clusters$event_cluster
# Subset the peak-season data to the Kelowna-matched clusters.
# Fix: the original reassigned `event_data`, silently clobbering the
# clustering feature matrix built earlier in the script.
kelowna_event_data <- hotspots_peak %>%
  filter(event_cluster %in% kelowna_cluster_ids)
# Summarize the sources, satellites, and sensors used for this event
event_sources <- kelowna_event_data %>%
  count(source, name = "num_events")
event_satellites <- kelowna_event_data %>%
  count(satellite, name = "num_events")
event_sensors <- kelowna_event_data %>%
  count(sensor, name = "num_events")
# Print summaries
print(event_sources)
## # A tibble: 7 × 2
## source num_events
## <chr> <int>
## 1 NASA2 1658
## 2 NASA3 1557
## 3 NASA6 460
## 4 NASA7 404
## 5 NASA_can 5
## 6 NASAwcan 366
## 7 NASAwusa 27
print(event_satellites)
## # A tibble: 5 × 2
## satellite num_events
## <chr> <int>
## 1 Aqua 3
## 2 NOAA-20 1961
## 3 S-NPP 2118
## 4 Terra 2
## 5 <NA> 393
print(event_sensors)
## # A tibble: 2 × 2
## sensor num_events
## <chr> <int>
## 1 MODIS 398
## 2 VIIRS-I 4079
In the Kelowna fire clusters,
data came from various sources, satellites, and sensors.
Most data were provided by NASA sources and the VIIRS-I sensor,
mainly using the S-NPP satellite.
The VIIRS-I sensor and S-NPP satellite are the most used
both for this event and the overall data.
Weather conditions are critical factors in the spread of fires.
temp: Temperature (°C)
rh: Relative Humidity (%)
ws: Wind Speed (km/h)
wd: Wind Direction (degrees)
pcp: Precipitation (mm)
# Build a summary table for the given numeric columns of a data frame.
#
# df:   data frame to summarize
# cols: character vector of column names to describe
# Returns a data frame with one row per column: missing-value count and
# min/median/mean/max (NA-removed, rounded to 2 decimal places).
describe_numerical <- function(df, cols) {
  # Fail fast with a clear message instead of an opaque NULL-subscript
  # error when a requested column does not exist
  missing_cols <- setdiff(cols, names(df))
  if (length(missing_cols) > 0) {
    stop("Columns not found: ", paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }
  summary_list <- list()
  for (col in cols) {
    values <- df[[col]]
    summary_list[[col]] <- data.frame(
      Variable = col,
      Missing_Values = sum(is.na(values)),
      Min = round(min(values, na.rm = TRUE), 2),
      Median = round(median(values, na.rm = TRUE), 2),
      Mean = round(mean(values, na.rm = TRUE), 2),
      Max = round(max(values, na.rm = TRUE), 2)
    )
  }
  bind_rows(summary_list)
}
# Describe the weather variables for the peak season
summary_hotspots <- describe_numerical(
  hotspots_peak, c("temp", "rh", "ws", "wd", "pcp")
)
print(summary_hotspots)
## Variable Missing_Values Min Median Mean Max
## 1 temp 0 -9.66 22.24 21.48 43.13
## 2 rh 0 11.00 33.00 35.48 100.00
## 3 ws 0 0.00 8.95 9.53 59.30
## 4 wd 0 0.00 213.00 202.25 360.00
## 5 pcp 0 0.00 0.00 0.23 78.59
There are no missing values in the weather data.
The minimum temperature recorded is -9.66°C, which is unusually low for
fire events.
The maximum temperature recorded is 43.13°C, which is expected during
peak fire seasons.
The relative humidity ranges from 11.00% to 100.00%, there are different
weather conditions.
Wind speeds range from 0.00 km/h to 59.3 km/h.
There is a full range of possible wind directions.
The precipitation data has a maximum value of 78.59 mm.
# Count subzero-temperature events by year and month, newest year first
subzero_temps_summary <- hotspots_peak %>%
  filter(temp < 0) %>%
  count(year, month, name = "count") %>%
  arrange(desc(year))
# Print the summary table
print(subzero_temps_summary)
## # A tibble: 11 × 3
## year month count
## <dbl> <ord> <int>
## 1 2023 Oct 1060
## 2 2022 Oct 82
## 3 2021 Oct 213
## 4 2020 Oct 666
## 5 2019 May 5
## 6 2019 Oct 1228
## 7 2018 Oct 22
## 8 2017 Oct 91
## 9 2016 Oct 2
## 10 2015 Oct 811
## 11 2014 Oct 9
The peak month for subzero temperature events is October,
even colder conditions do not stop fires from happening.
The median temperatures are between 20°C and 25°C.
2019 stands out with a slightly lower median temperature.
There are significant outliers present in all years,
particularly in 2014, 2017, 2018, 2021 and 2023.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
There is a clear seasonal pattern,
temperatures peak around July and August, then decrease towards
October.
Some years, like 2014 and 2021, show higher peaks, meaning hotter summer
months.
Median relative humidity falls between 30% and 40%.
In 2019, median values are higher, indicating more humid season.
All years have significant outliers, especially 2014, 2017, and 2021,
showing extremely humid days.
## `geom_smooth()` using formula = 'y ~ x'
Humidity values typically increase from August to October.
Notable peaks in summer are seen in 2016, 2019 and 2020.
The median wind speeds generally fall between 7 and 9 km/h.
There are significant outliers in all years, particularly noticeable in
2014, 2015, and 2018,
there were days with extremely high wind speeds.
## `geom_smooth()` using formula = 'y ~ x'
There is a clear seasonal pattern where wind speeds tend to be
higher in the summer months and decrease towards October.
Some years, like 2015 and 2020, show more pronounced peaks in wind
speed.
The overall trend remains consistent with seasonal variations.
Visualize with Wind Rose to see most common wind direction using converted wind speed.
# Add wind speed in m/s (windRose expects metres per second)
hotspots_peak <- hotspots_peak %>%
  mutate(ws_m_s = ws / 3.6)  # km/h -> m/s
# Wind rose of direction frequency weighted by converted speed
windRose(mydata = hotspots_peak, ws = "ws_m_s", wd = "wd",
         main = "Wind Rose", paddle = FALSE)
Most of the wind comes from the west and southwest.
Regions east and north of areas with strong western, southern, and
southwestern winds
should be careful, as these winds can quickly spread fires.
The median values for precipitation are consistently very low,
often close to zero.
Despite this, there are significant outliers in every year, particularly
in 2017 and 2023.
These outliers indicate that certain days had much higher than average
rainfall.
## `geom_smooth()` using formula = 'y ~ x'
There is a notable peak in July for several years,
with some years, like 2018, 2019 and 2020,
showing higher average precipitation during these months.
Since precipitation often has many zero entries,
need to handle these outliers carefully to ensure they don’t skew our
results.
# Drop rows whose `variable` value lies outside the 1.5 * IQR fences.
#
# data:     data frame to filter
# variable: column name (string) to apply the fences to
# Returns `data` without the outlier rows. Note: rows with NA in
# `variable` are also dropped, since filter() removes NA comparisons.
remove_outliers <- function(data, variable) {
  q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr
  upper_bound <- q3 + 1.5 * iqr
  # Fix: use the .data pronoun instead of indexing `data` inside
  # filter() — indexing the full column mis-aligns under grouped input
  data %>%
    filter(.data[[variable]] >= lower_bound,
           .data[[variable]] <= upper_bound)
}
# Drop temperature, humidity, and wind-speed outliers
hotspots_peak_clean <- hotspots_peak %>%
  remove_outliers("temp") %>%
  remove_outliers("rh") %>%
  remove_outliers("ws")
# Re-describe the weather variables after outlier removal
summary_clean <- describe_numerical(
  hotspots_peak_clean, c("temp", "rh", "ws", "wd", "pcp")
)
print(summary_clean)
## Variable Missing_Values Min Median Mean Max
## 1 temp 0 7.52 22.70 22.29 36.19
## 2 rh 0 11.00 32.00 33.51 59.00
## 3 ws 0 1.80 8.87 9.28 16.66
## 4 wd 0 0.00 212.00 202.28 360.00
## 5 pcp 0 0.00 0.00 0.14 46.46
The boxplots now show a more consistent distribution of
temperatures across different years.
The majority of the years have median temperatures between 20°C and
25°C.
The extreme outliers have been reduced, providing a clearer view of the
central temperature tendencies.
The median RH values generally remain between 30% and 40% across
the years.
2019 still shows higher median RH, indicating more humid
conditions.
Significant outliers have been reduced, giving a better understanding of
the general RH trends.
Median wind speeds for each year are now clearly visible,
generally falling between 7 and 10 km/h.
The removal of outliers helps to see typical wind speeds more
accurately.
The boxplots show that some years, like 2020 and 2021, still have a
wider range of wind speeds.
These indices are derived from weather conditions and provide insights into fire behavior and potential risks.
ffmc: Fine Fuel Moisture Code
dmc: Duff Moisture Code
dc: Drought Code
isi: Initial Spread Index
bui: Buildup Index
fwi: Fire Weather Index
# Describe the Canadian FWI-system indices for the peak season
summary_fire_indices <- describe_numerical(
  hotspots_peak, c("ffmc", "dmc", "dc", "isi", "bui", "fwi")
)
print(summary_fire_indices)
## Variable Missing_Values Min Median Mean Max
## 1 ffmc 0 0.3 91.50 89.53 99.00
## 2 dmc 0 0.0 85.20 92.60 459.78
## 3 dc 0 0.0 531.77 518.81 1122.11
## 4 isi 0 0.0 8.62 8.50 137.70
## 5 bui 0 0.0 119.12 123.42 458.66
## 6 fwi 0 0.0 30.09 29.13 183.90
A numeric rating of the moisture content of litter and other
cured fine fuels.
This code is an indicator of the relative ease of ignition and the
flammability of fine fuel.
The FFMC scale ranges from 0 to 101.
0-30: Very wet conditions; ignition is difficult.
30-70: Damp conditions; moderate difficulty for ignition.
70-85: Dry conditions; fuels are easily ignitable.
85-101: Very dry conditions; fuels are highly ignitable and fires can
spread rapidly.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The values in the dataset show that during the peak season, the FFMC values are generally high, with a median around 91.50 and a maximum of 99.00. This suggests that the fine fuels are often in a dry state, making them highly ignitable and prone to rapid fire spread.
Most years have high FFMC values above 90. Only 2019 and 2020 show slightly lower values.
## `geom_smooth()` using formula = 'y ~ x'
The lineplot shows that FFMC values peak every year during the fire season.
After removing outliers, the boxplots for FFMC show a more
consistent distribution across the years.
The median values are mostly above 90.
A numeric rating of the average moisture content
of loosely compacted organic layers of moderate depth.
This code gives an indication of fuel consumption in moderate duff
layers
and medium-size woody material.
It ranges typically from 0 to over 200, higher values indicate drier conditions and a higher fire risk.
0-20: Low fire danger, duff layers are wet, and ignition is unlikely. 21 - 40: Moderate fire danger, duff layers start drying. 41 - 100: High fire danger, the duff layer is dry, prone to ignition. 100 + : Extreme fire danger, the duff layer is very dry, and ignition is very likely with the potential for intense fires.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram shows that most DMC values range between 0 and 150,
with a peak around 50-100.
Values above 200 are less common but indicate long dry periods.
The boxplots for each year show that 2017 and 2018 had
particularly high outliers,
indicating extremely dry conditions.
2019 had lower DMC values, showing less severe drought.
## `geom_smooth()` using formula = 'y ~ x'
The lineplot indicates that DMC values peak in late summer (August) and decrease in autumn.
After removing the outliers, the DMC values became more
consistent across different years.
The median values now mostly fall between 50 and 100, providing a
clearer picture of typical drought conditions.
While 2017 and 2021 still show higher values, the extremes have been
reduced.
A numeric rating of the average moisture content of deep, compact
organic layers.
This code is a useful indicator of seasonal drought effects on forest
fuels and the amount of smoldering
in deep duff layers and large logs.
0-100: Indicates wet conditions with low fire potential.
100-300: Moderate drought conditions with increasing fire
potential.
300-500: High drought conditions, leading to high fire risk.
500+: Indicates extreme drought, posing a very high fire risk and
potential for intense, prolonged burning.
DC is important because it helps understand how dry the forest
floor is
and the chance for deep-burning fires.
Higher DC values usually mean longer dry periods and can show how severe
fire seasons might be.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram of DC values from 2014 to 2023 shows most values
are between 300 and 700, peaking around 500.
This means BC often has conditions that can lead to deep-burning
fires.
The boxplot of DC values across years show the variability and
trends in drought conditions over time.
2017 and 2023 have higher median values, meaning very dry
conditions.
2016 and 2019 show lower DC values, suggesting wetter conditions and
potentially less severe fire activity.
## `geom_smooth()` using formula = 'y ~ x'
The line plot of DC values by month shows a clear seasonal
trend,
DC values increase during the summer months (July to September) and
decrease in the autumn.
This occurs at the time of the peak fire season.
There are a lot of high DC values, more than 500, it shows conditions
for deep-burning fires.
The median DC values are more consistent, generally falling
between 250 and 550.
The range of the data has been reduced, with fewer extreme
outliers.
Create a combined plot from monthly average indices and fire
events per month tables
to show the trends of the FFMC, DMC and DC and the number of fire
occurrences over time.
# Join the monthly index averages with the event counts, build a Date
# axis, and pre-compute scaling values for a combined plot
monthly_data <- merge(monthly_avg, fire_events_per_month,
                      by = c("year", "month"))
# Ordered month factor -> month number, needed to construct dates
monthly_data$month <- as.numeric(monthly_data$month)
monthly_data$Date <- as.Date(
  paste(monthly_data$year, monthly_data$month, "01", sep = "-")
)
# Normalize event counts to [0, 1] for overlay plotting
max_events <- max(monthly_data$n_events, na.rm = TRUE)
monthly_data <- monthly_data %>%
  mutate(norm_n_events = n_events / max_events)
# Index maxima used for axis scaling
max_ffmc <- max(monthly_data$avg_ffmc, na.rm = TRUE)
max_dmc <- max(monthly_data$avg_dmc, na.rm = TRUE)
max_dc <- max(monthly_data$avg_dc, na.rm = TRUE)
The plots show a clear relationship between the fire indices
(FFMC, DMC, and DC) and the number of fires.
When the number of fires is high, the indices also show higher
values.
It’s important to note that while high indices indicate
conditions favorable for fires,
they alone do not cause fires.
A numerical rating of the expected rate of fire spread
based on wind speed, temperature, and fine fuel moisture content.
ISI is crucial for understanding how quickly a fire can spread once it
has ignited.
0-3: Low spread potential. Fires will spread slowly and are
relatively easy to control.
4-7: Moderate spread potential. Fires spread more quickly and may
require more effort to control.
8-12: High spread potential. Fires spread rapidly and can be difficult
to control.
13-19: Very high spread potential. Fires spread very rapidly and are
challenging to control.
20+: Extreme spread potential. Fires spread uncontrollably and can be
extremely dangerous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The boxplot shows variability in ISI values across years, with
notable outliers in 2014.
The histogram indicates that most ISI values are low to
moderate.
## `geom_smooth()` using formula = 'y ~ x'
ISI values tend to peak during the summer months, indicating higher fire spread potential during this period.
# Keep only the entries with extreme ISI values (40 or more)
isi_outliers <- hotspots_peak %>%
  filter(isi >= 40)
# Check the number of outliers
dim(isi_outliers)
## [1] 28 43
# Which clusters do the ISI outliers belong to?
isi_outliers %>%
  count(event_cluster) %>%
  arrange(desc(n))
## # A tibble: 5 × 2
## event_cluster n
## <int> <int>
## 1 355 10
## 2 128 9
## 3 45 6
## 4 325 2
## 5 436 1
# Most ISI outliers come from clusters 355 and 128
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
The data from July 2014, shows very high ISI values at a specific
location in British Columbia.
This matches the date of the Mount McAllister fire, a large wildfire
that burned over 20,000 hectares.
The cleaned data shows a clearer view of typical ISI values.
A numerical rating of the total amount of fuel available for
combustion.
It is derived from the Duff Moisture Code (DMC) and the Drought Code
(DC).
Low: 0-40
Moderate: 41-80
High: 81-120
Extreme: 121 and above
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
BUI values mostly range from 60 to 150.
This means the fire potential is moderate to high, with some extreme
cases, for most of the dataset.
## `geom_smooth()` using formula = 'y ~ x'
BUI peaks in mid-summer, which matches the time when fire activity is usually the highest.
After removing outliers the typical BUI values are between 50 and
150,
indicating moderate to high fire potential.
Outliers have been reduced, but 2017 and 2021 still show higher BUI
values.
A numeric rating of fire intensity. It is based on the ISI and
the BUI,
and is used as a general index of fire danger throughout the forested
areas of Canada.
Low (0-5): Minimal fire danger.
Moderate (6-12): Fires can start from most accidental causes, but the
spread is slow.
High (13-20): Fires can start easily and spread rapidly.
Very High (21-30): Fires will start very easily, spread rapidly, and
burn intensely.
Extreme (31+): Fires start and spread quickly, and are intense and
challenging to control.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram shows that most FWI values are between 10 and 40,
fire danger is from moderate to extreme.
Many values are above 30, severe fire weather conditions are
frequent.
2014 had some extreme outliers.
## `geom_smooth()` using formula = 'y ~ x'
The lineplot shows that FWI peaks in mid-summer, peak fire activity period.
The FWI is calculated using the Initial Spread Index and the
Buildup Index,
both of which take into account wind speed,
temperature, humidity, and fuel moisture.
This is why the FWI is a reliable indicator of overall fire danger.
Fire agencies use the FWI to inform the public, issue fire bans,
and respond to fires.
It helps prioritize resources and actions to mitigate fire risks
effectively.
# Keep hotspot records whose FWI is 75 or higher.
# NOTE(review): the threshold is inclusive (>= 75), not "greater than 75".
fwi_outliers <- hotspots_peak %>%
filter(fwi >= 75)
# Tally the FWI outliers by event cluster, largest counts first
count(fwi_outliers, event_cluster, sort = TRUE)
## # A tibble: 5 × 2
## event_cluster n
## <int> <int>
## 1 355 10
## 2 128 9
## 3 45 6
## 4 325 2
## 5 436 1
# Most fwi outliers come from clusters 355 and 128, same as isi outliers
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
The analysis of FWI extreme outliers in 2014, specifically values
greater than 75,
reveals that these outliers are concentrated in the same event clusters
as the ISI outliers
(clusters 355 and 128).
This corresponds to the Mount McAllister fire in British Columbia, which
occurred in July 2014.
After removing outliers, the FWI boxplots show more consistent
distributions across the years.
Median FWI values generally fall between 20 and 40.
Create a combined plot from monthly average indices and fire
events per month tables
to show the trends of the ISI, BUI and FWI and the number of fire
occurrences over time.
# Use the previously normalized monthly_data table.
# Per-index maxima (NAs ignored), used to rescale each index series onto a
# common axis when overlaying the indices with monthly fire counts.
max_isi <- max(monthly_data$avg_isi, na.rm = TRUE)
max_bui <- max(monthly_data$avg_bui, na.rm = TRUE)
max_fwi <- max(monthly_data$avg_fwi, na.rm = TRUE)
The plots show a clear link between indices and the number of
fires.
When there are more fires, the FWI is also higher.
It’s important to remember that while high FWI values mean
conditions are good for fires,
they don’t indicate fire presence.
Also, while the FWI helps predict how severe fires might
be,
it can’t tell us exactly how many fires will happen in a season.
For example, the FWI was lower in 2018 than in 2017, but 2018 still had
some major fires.
This shows that indices can indicate fire conditions but don’t predict
the exact number of fires.
# Summary statistics (missing counts, min/median/mean/max) for the six
# fire-weather indices on the outlier-cleaned dataset
summary_fire_indices_clean <- describe_numerical(hotspots_peak_clean, c('ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi'))
print(summary_fire_indices_clean)
## Variable Missing_Values Min Median Mean Max
## 1 ffmc 0 55.37 91.64 90.85 99.00
## 2 dmc 0 0.00 87.19 95.75 459.78
## 3 dc 0 0.00 537.09 528.00 1122.11
## 4 isi 0 0.51 8.76 8.75 36.70
## 5 bui 0 0.00 120.77 127.70 458.66
## 6 fwi 0 2.89 30.55 30.17 56.96
These variables include additional factors that can influence fire events, such as fuel type and rate of spread.
fuel: Fuel Type
ros: Rate of Spread
hfi: Head Fire Intensity
D1: Deciduous trees (leafless, early spring to fall)
C2, C3, C4, C5, C7: Various types of coniferous trees
O1, O1a, O1b: Grass or herbaceous vegetation
M1, M1M2, M2, M2_25, M2_35, M2_50, M2_65: Mixedwood or transitional
vegetation
bog: Wetland areas with peatland vegetation
water: Water bodies
urban: Urban areas
non_fuel: Areas with no significant vegetation (rock, gravel)
low_veg: Areas with low vegetation (possibly recently burned or
cleared)
farm: Agricultural areas
D2: Deadfall or downed wood
# Summary of the fire-behaviour variables: rate of spread (ros, m/min)
# and head fire intensity (hfi, kW/m) on the peak-season data
summary_other_variables <- describe_numerical(hotspots_peak, c('ros', 'hfi'))
print(summary_other_variables)
## Variable Missing_Values Min Median Mean Max
## 1 ros 0 0 4.46 6.69 96.23
## 2 hfi 0 0 3883.00 8776.72 93142.00
The predicted speed of the fire at the front or head of the fire
(where the fire moves fastest).
It is measured in metres per minute and is based on the Fuel Type,
Initial Spread Index, Buildup Index, and some others.
The scale ranges from 0 to over 96.23 m/min in the dataset.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The plot shows that the ROS can vary significantly year to year, with some years experiencing more extreme fire spread conditions.
The median ROS values now range between 3 and 10 m/min. Extreme values above 25 m/min have been significantly reduced.
Measures the intensity or energy output of a fire at its front
(head).
HFI is measured in kilowatts per metre (kW/m) of the fire front and is
calculated based on the Rate of Spread (ROS) and the Total Fuel
Consumption (TFC).
Low (0-500 kW/m): Fires are relatively easy to control and
generally cause limited damage.
Moderate (500-2000 kW/m): Fires can be challenging to control, requiring
more resources and effort.
High (2000-4000 kW/m): Fires are intense and difficult to manage, often
requiring aerial firefighting resources.
Very High (4000-10,000 kW/m): Fires are extremely intense and nearly
impossible to control, posing significant risk to life and
property.
Extreme (10,000+ kW/m): Fires exhibit explosive behavior and can cause
catastrophic damage.
Helps predict fire destructiveness, allocate resources, warn or evacuate public.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most HFI values are concentrated at the lower end of the
scale.
Very high HFI values are outliers; they reflect occasional extremely intense
fires.
While most fires are less intense, the few high-intensity fires can be
significant and impactful.
Boxplots show significant outliers in almost all years, there are some fires with extremely high intensities.
## `geom_smooth()` using formula = 'y ~ x'
Different years show a lot of differences in HFI values.
For example, 2014 and 2021 had higher peaks, meaning there was more
intense fire activity in those years.
2020 had lower HFI values, there were milder fire conditions or better
fire control efforts.
The plots also show how HFI can change from month to month within
a single fire season.
Weather conditions, temperature, humidity, and wind speed, can affect
how fires behave.
# Use the previously normalized monthly_data table.
# Maximum monthly mean HFI (NAs ignored), used to scale the combined plot
max_hfi <- max(monthly_data$avg_hfi, na.rm = TRUE)
The HFI index shows notable peaks, particularly in the years 2017, 2018, 2021 and 2023.
# Keep hotspot records whose HFI is 75000 kW/m or higher.
# NOTE(review): the threshold is inclusive (>= 75000), not "greater than".
hfi_outliers <- hotspots_peak %>%
filter(hfi >= 75000)
# Count the outliers per event cluster, most affected clusters first
hfi_outliers %>%
count(event_cluster) %>%
arrange(desc(n))
## # A tibble: 17 × 2
## event_cluster n
## <int> <int>
## 1 4094 78
## 2 3983 49
## 3 11877 41
## 4 4024 29
## 5 4310 9
## 6 4089 8
## 7 4357 7
## 8 4218 5
## 9 7478 5
## 10 11970 5
## 11 4493 4
## 12 45 2
## 13 3956 2
## 14 4096 2
## 15 4503 2
## 16 5299 2
## 17 4011 1
# Most hfi outliers come from clusters 4094, 3983, 11877 and 4024
# Inspect the details (dates, location, size) of the four clusters that
# contribute the most HFI outliers
cluster_summary %>%
filter(event_cluster %in% c(4094, 3983, 11877, 4024))
## # A tibble: 4 × 7
## event_cluster start_date end_date num_events latitude
## <int> <dttm> <dttm> <int> <dbl>
## 1 3983 2018-08-11 11:29:00 2018-08-15 08:41:00 361 49.0
## 2 4024 2018-08-11 03:55:00 2018-08-14 03:22:38 313 49.5
## 3 4094 2018-07-18 15:42:00 2018-07-20 11:28:00 200 49.1
## 4 11877 2023-08-11 16:47:00 2023-08-20 05:05:00 6901 49.1
## # ℹ 2 more variables: longitude <dbl>, distance_to_kelowna_fire <dbl>
Snowy Mountain Fire and other fires in the Okanagan region in
2018.
Placer Mountain Fire area, a notable fire from 2018, which also burned
in 2023.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
The median HFI values are generally below 20,000 kW/m after removing outliers.
In this section, we use statistical tests to uncover meaningful patterns and build models to predict fire risk.
Subset the hotspots dataset to focus on specific numerical variables.
# Define the numeric columns used for testing and modelling
numeric_columns <- c("temp", "rh", "ws", "pcp", "ffmc", "dmc", "dc", "isi", "bui", "fwi", "ros", "hfi")
# Subset the cleaned data to those columns plus time/cluster identifiers
hotspots_test <- hotspots_peak_clean %>%
select(all_of(numeric_columns), year, month, rep_date, event_cluster)
str(hotspots_test)
## tibble [1,659,019 × 16] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:1659019] 21.2 26.8 27.8 23.4 24.6 28.8 25.5 17.2 28.2 26.1 ...
## $ rh : num [1:1659019] 32 27 25 43 27 30 29 36 23 21 ...
## $ ws : num [1:1659019] 12.4 4.4 11.1 6.5 6.7 10.6 7.6 13.6 9.7 9 ...
## $ pcp : num [1:1659019] 0 0 0 6.6 0 0 0 0.6 0 0 ...
## $ ffmc : num [1:1659019] 92.9 92.9 91.3 70.4 94.7 92.5 93.9 89 94.5 94.8 ...
## $ dmc : num [1:1659019] 114.9 89.1 83.3 70.4 63.8 ...
## $ dc : num [1:1659019] 550 503 632 382 255 ...
## $ isi : num [1:1659019] 12 8.1 9 0.9 11.7 10.5 10.9 7.3 13.2 13.2 ...
## $ bui : num [1:1659019] 150.9 123.5 125.3 96.4 78.5 ...
## $ fwi : num [1:1659019] 41.4 29.9 32.3 4.3 31.1 39 37.1 29.1 39.4 42 ...
## $ ros : num [1:1659019] 0.624 10.151 0.415 0.436 16.388 ...
## $ hfi : num [1:1659019] 263 13691 168 439 18430 ...
## $ year : num [1:1659019] 2014 2014 2014 2014 2014 ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 8 8 8 8 7 8 8 8 8 7 ...
## $ rep_date : POSIXct[1:1659019], format: "2014-08-06 05:11:00" "2014-08-14 10:14:00" ...
## $ event_cluster: int [1:1659019] 1 2 3 4 5 6 6 7 8 9 ...
# List of fire indices to summarize
fire_indices <- c('ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi')
# Summary statistics (missing counts, min/median/mean/max) for the fire
# indices in the hotspots_test dataset
summary_fire_indices <- describe_numerical(hotspots_test, fire_indices)
# Print the summary statistics for the fire indices
print(summary_fire_indices)
## Variable Missing_Values Min Median Mean Max
## 1 ffmc 0 0.3 91.36 89.32 99.00
## 2 dmc 0 0.0 83.41 90.12 459.78
## 3 dc 0 0.0 530.41 516.47 1122.11
## 4 isi 0 0.0 8.44 8.24 137.70
## 5 bui 0 0.0 117.65 121.04 458.66
## 6 fwi 0 0.0 29.52 28.37 183.90
During EDA we looked into distributions of six key Fire Indices across different years.
The distributions of these indices for a particular year, 2018,
appeared to be close to the overall average values of these indices.
We also found out that 2018 had some significant fire events.
# Average number of hotspot observations per year over the study period
nrow(hotspots_peak) / length(unique(hotspots_peak$year))
## [1] 173024.2
# Average number of unique event_clusters (distinct fire events) per year
length(unique(hotspots_peak$event_cluster)) / length(unique(hotspots_peak$year))
## [1] 1502.6
# Number of observations in 2018 (from the cleaned dataset)
nrow(filter(hotspots_peak_clean, year == 2018))
## [1] 339604
# Number of unique event clusters in 2018 (from the cleaned dataset)
length(unique(filter(hotspots_peak_clean, year == 2018)$event_cluster))
## [1] 1932
From these calculations, we found that 2018 had a significantly higher number of observations and event clusters compared to the average over the ten-year period.
Despite the higher number of fire events, the main indices (FFMC, DMC, DC, ISI, BUI, FWI) remained close to their median values across the dataset.
Therefore we wanted to test if the increased number of fires in 2018 was due to random chance or if other factors were at play.
We need to perform statistical tests to compare the indices for 2018 against the overall dataset.
A p-value greater than 0.05 would suggest that the higher number
of fires could be due to chance,
while a lower p-value would point to the presence of other contributing
factors.
Summary statistics computed before running the statistical tests
# Summary statistics for the overall dataset.
# NOTE(review): this summarizes hotspots_peak, while the 2018 subset below
# comes from hotspots_test (built from hotspots_peak_clean) — confirm the
# intended comparison population.
summary_overall <- describe_numerical(hotspots_peak, fire_indices)
summary_overall
## Variable Missing_Values Min Median Mean Max
## 1 ffmc 0 0.3 91.50 89.53 99.00
## 2 dmc 0 0.0 85.20 92.60 459.78
## 3 dc 0 0.0 531.77 518.81 1122.11
## 4 isi 0 0.0 8.62 8.50 137.70
## 5 bui 0 0.0 119.12 123.42 458.66
## 6 fwi 0 0.0 30.09 29.13 183.90
# Subset the test data to the year 2018
hotspots_test_2018 <- hotspots_test %>% filter(year == 2018)
# Summary statistics for the fire indices in 2018 only
summary_2018 <- describe_numerical(hotspots_test_2018, fire_indices)
summary_2018
## Variable Missing_Values Min Median Mean Max
## 1 ffmc 0 11.38 91.27 90.07 99.00
## 2 dmc 0 0.00 100.13 97.82 343.14
## 3 dc 0 0.00 492.52 486.80 878.16
## 4 isi 0 0.00 8.22 7.90 24.81
## 5 bui 0 0.00 133.64 128.35 341.15
## 6 fwi 0 0.00 30.72 28.83 57.93
Check the histograms to see whether they follow a normal distribution.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Index Overall_Median Year_2018_Median Difference Percentage_Difference
## 1 ffmc 91.50 91.27 -0.23 -0.25
## 2 dmc 85.20 100.13 14.93 17.52
## 3 dc 531.77 492.52 -39.25 -7.38
## 4 isi 8.62 8.22 -0.40 -4.64
## 5 bui 119.12 133.64 14.52 12.19
## 6 fwi 30.09 30.72 0.63 2.09
During our normality checks, some of the variables did not show a smooth bell curve, meaning that they do not follow a normal distribution. We decided to perform nonparametric tests, which do not assume a normal distribution of the data, to compare the results.
# Compare each fire index between the full cleaned dataset and the 2018
# subset with two-sample Wilcoxon rank-sum tests (nonparametric; no
# normality assumption).
# NOTE(review): hotspots_test_2018 rows are also contained in
# hotspots_peak_clean, so the two samples are not independent — consider
# testing 2018 against all *other* years instead.
test_results_wilcoxon <- lapply(fire_indices, function(index) {
wilcox.test(hotspots_peak_clean[[index]], hotspots_test_2018[[index]])
})
test_results_wilcoxon
## [[1]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.9981e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
##
## [[2]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.2308e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
##
## [[3]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 3.2781e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
##
## [[4]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 3.0155e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
##
## [[5]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.329e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
##
## [[6]]
##
## Wilcoxon rank sum test with continuity correction
##
## data: hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.761e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Build and print a summary table (means, medians, p-values, significance
# flags) from the Wilcoxon test results
summary_table_wilcoxon <- create_summary_table(test_results_wilcoxon)
print(summary_table_wilcoxon)
## Index Overall_Mean Y_2018_Mean Overall_Median Y_2018_Median P_Value
## 1 ffmc 89.53 90.07 91.50 91.27 <0.000001
## 2 dmc 92.60 97.82 85.20 100.13 <0.000001
## 3 dc 518.81 486.80 531.77 492.52 <0.000001
## 4 isi 8.50 7.90 8.62 8.22 <0.000001
## 5 bui 123.42 128.35 119.12 133.64 <0.000001
## 6 fwi 29.13 28.83 30.09 30.72 <0.000001
## Significant
## 1 Yes
## 2 Yes
## 3 Yes
## 4 Yes
## 5 Yes
## 6 Yes
All six fire indices show significant differences between the values in 2018 and the overall values across all years.
This suggests that the conditions in 2018 were statistically different from the overall conditions in the dataset.
There were likely specific factors in 2018 that contributed to these differences.
While the Wilcoxon tests show that there are significant differences in the fire indices, they don’t prove that higher values of these indices cause more fires.
However, the higher values of indices like FWI, DMC, and others in 2018 suggest there is a link, meaning higher values of these indices likely mean conditions for more fires.
To find the relationship between these indices and the number of fires, we need to perform further analysis, such as regression modeling.
Further we wanted to test similar clusters.
We need to find two clusters of fire events with similar characteristics, mean indices, perform statistical tests to compare their indices, and visualize the results.
We selected clusters based on the similarity of their mean Fire Weather Index (FWI) and Drought Code (DC).
# Per-cluster summary: event count plus mean FWI and DC, used below to
# find clusters with similar fire-weather characteristics
cluster_summary_indices <- hotspots_test %>%
group_by(event_cluster) %>%
summarise(
num_events = n(),
mean_fwi = mean(fwi), # Mean FWI value for each cluster
mean_dc = mean(dc) # Mean DC value for each cluster
)
# Keep mid-sized clusters: between 1000 and 6000 events (inclusive).
# NOTE(review): the comment previously said "1000+", but there is also an
# upper bound of 6000.
chosen_clusters <- cluster_summary_indices %>%
filter(num_events >= 1000 & num_events <= 6000)
# Rank clusters by how close their mean FWI and mean DC are to the group
# averages of the supplied table.
#
# @param df A data frame with `mean_fwi` and `mean_dc` columns.
# @return `df` with `fwi_diff`, `dc_diff`, and a combined `similar` score
#   (lower = more typical), sorted by that score ascending.
calculate_similar <- function(df) {
  df %>%
    mutate(
      # Fix: compare against the averages of `df` itself rather than the
      # global `chosen_clusters`, so the function works on any input table.
      # Inside mutate(), mean(mean_fwi) is the column mean, so results are
      # identical to the original when called on chosen_clusters.
      fwi_diff = abs(mean_fwi - mean(mean_fwi)),
      dc_diff = abs(mean_dc - mean(mean_dc)),
      similar = fwi_diff + dc_diff
    ) %>%
    arrange(similar)
}
# Score the candidate clusters and keep the five with mean FWI and DC
# closest to the group average (two of these are selected below).
# NOTE(review): slice(1:5) keeps five rows, not two.
chosen_clusters <- chosen_clusters %>%
calculate_similar() %>%
slice(1:5)
# Chosen clusters
print(chosen_clusters)
## # A tibble: 5 × 7
## event_cluster num_events mean_fwi mean_dc fwi_diff dc_diff similar
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3953 3460 32.5 523. 2.67 0.192 2.86
## 2 4009 1217 31.7 520. 1.92 3.32 5.24
## 3 3949 1661 32.3 528. 2.45 3.95 6.40
## 4 13137 1122 27.2 520. 2.64 3.99 6.64
## 5 8019 3359 35.8 528. 6.00 3.98 9.98
# IDs of the two clusters picked for the comparison test
chosen_ids <- c(4009, 13137)
# Summary table (count and mean indices) for the chosen clusters.
# NOTE(review): this overwrites the earlier `cluster_summary` object used
# for the HFI outlier inspection — consider a distinct name.
cluster_summary <- hotspots_test %>%
filter(event_cluster %in% chosen_ids) %>%
group_by(event_cluster, year, month) %>%
summarize(
count = n(),
mean_ffmc = mean(ffmc, na.rm = TRUE),
mean_dmc = mean(dmc, na.rm = TRUE),
mean_dc = mean(dc, na.rm = TRUE),
mean_isi = mean(isi, na.rm = TRUE),
mean_bui = mean(bui, na.rm = TRUE),
mean_fwi = mean(fwi, na.rm = TRUE)
)
## `summarise()` has grouped output by 'event_cluster', 'year'. You can override
## using the `.groups` argument.
print(cluster_summary)
## # A tibble: 2 × 10
## # Groups: event_cluster, year [2]
## event_cluster year month count mean_ffmc mean_dmc mean_dc mean_isi mean_bui
## <int> <dbl> <ord> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4009 2018 Aug 1217 90.3 108. 520. 8.27 142.
## 2 13137 2023 Aug 1122 92.6 53.7 520. 9.10 85.1
## # ℹ 1 more variable: mean_fwi <dbl>
Plot the clusters on the map
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ 156 tiles needed, this may take a while (try a smaller zoom?)
Subset the dataframe for the chosen clusters
# Filter data for the two chosen clusters
cluster_1 <- hotspots_test %>% filter(event_cluster == chosen_ids[1]) # cluster 4009
cluster_2 <- hotspots_test %>% filter(event_cluster == chosen_ids[2]) # cluster 13137
Visualize the distribution
# Two-sided Welch two-sample t-test comparing DC between the two clusters
t_test_result <- t.test(cluster_1$dc, cluster_2$dc, alternative = "two.sided")
# Print the t-test result
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: cluster_1$dc and cluster_2$dc
## t = 0.40558, df = 2184.3, p-value = 0.6851
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.568158 3.907413
## sample estimates:
## mean of x mean of y
## 520.3556 519.6860
The t-test shows no significant difference in the DC values between the two clusters (cluster_1 and cluster_2).
The p-value is greater than 0.05, and the confidence interval suggests that the observed differences could be due to random chance rather than a true difference in means.
Therefore, the DC values for these clusters are statistically similar.
Analyze the weekly number of fires using various fire index predictors. The goal is to build a linear regression model to predict the number of fires and see its performance.
# Label each observation with its year-week ("%U": week of year, Sunday start)
hotspots_test$week <- format(hotspots_test$rep_date, "%Y-%U")
# Weekly fire counts plus weekly means of the weather and index variables
weekly_summary <- hotspots_test %>%
  group_by(week) %>%
  summarise(
    fires = n(),
    across(
      all_of(c("hfi", "temp", "rh", "ws", "pcp", "ffmc",
               "dmc", "dc", "isi", "bui", "fwi", "ros")),
      ~ mean(.x, na.rm = TRUE)
    )
  )
weekly_summary
## # A tibble: 263 × 14
## week fires hfi temp rh ws pcp ffmc dmc dc isi bui
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014-17 6 5013. 19.5 24.3 10.1 0 91.6 21.4 124. 8.97 29.3
## 2 2014-18 57 1420. 10.1 23.4 12.9 7.02e-4 90.8 21.6 57.9 10.3 22.8
## 3 2014-19 98 914. 18.7 31.4 7.63 2.17e+0 83.9 27.3 69.5 6.39 28.7
## 4 2014-20 3 2907. 13.2 20 19.1 0 91.6 46.4 108 14.2 46.3
## 5 2014-21 21 2267. 18.9 37.0 6.64 3.95e-2 89.8 42.1 121. 6.6 45.6
## 6 2014-22 74 10103. 20.8 22.7 9.75 2.70e-4 93.1 68.3 160. 11.2 70.1
## 7 2014-23 8 7724 19.0 42.8 10.2 4.5 e+0 78.4 51.2 220. 6.78 63.0
## 8 2014-24 3 0 18 45 5.6 1.13e+1 52.8 16.8 313. 0.3 29.7
## 9 2014-25 7 799. 20.8 35.1 6.69 0 83.5 19.3 156. 2.57 29.4
## 10 2014-26 6 2390. 21.8 39.3 12.7 6.67e-1 88.6 49.1 283. 6.93 68.5
## # ℹ 253 more rows
## # ℹ 2 more variables: fwi <dbl>, ros <dbl>
Transform the number of fires to get more normally distributed data
# Find the Box-Cox lambda that best normalizes the weekly fire counts
# (MASS::boxcox profiles the log-likelihood over a grid of lambda values)
boxcox_result <- MASS::boxcox(lm(fires ~ 1, data = weekly_summary))
# Lambda at the maximum profile log-likelihood
lm_best <- boxcox_result$x[which.max(boxcox_result$y)]
lm_best
## [1] 0.02020202
# Apply the Box-Cox transform with the chosen lambda; the +1 shift keeps
# the transform defined for weeks with zero fires
weekly_summary <- weekly_summary %>%
mutate(boxcox_fires = ((fires + 1)^lm_best - 1) / lm_best)
# Model 1: regress the transformed fire counts on all candidate indices
model_1 <- lm(boxcox_fires ~ dmc + dc + bui + fwi + hfi + ffmc + isi, data = weekly_summary)
summary(model_1)
##
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + bui + fwi + hfi + ffmc +
## isi, data = weekly_summary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9510 -1.5283 -0.0813 1.6149 6.9366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.989e+00 1.784e+00 2.236 0.026250 *
## dmc 1.087e-01 3.181e-02 3.417 0.000737 ***
## dc 1.236e-02 2.257e-03 5.477 1.04e-07 ***
## bui -1.311e-01 3.380e-02 -3.879 0.000134 ***
## fwi 2.760e-01 8.356e-02 3.302 0.001095 **
## hfi 2.107e-04 6.325e-05 3.331 0.000993 ***
## ffmc -3.667e-03 2.366e-02 -0.155 0.876942
## isi -5.540e-01 1.725e-01 -3.212 0.001486 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.154 on 255 degrees of freedom
## Multiple R-squared: 0.4635, Adjusted R-squared: 0.4488
## F-statistic: 31.47 on 7 and 255 DF, p-value: < 2.2e-16
# Check for multicollinearity via variance inflation factors
# (vif presumably from the car package — confirm)
vif(model_1)
## dmc dc bui fwi hfi ffmc isi
## 88.184858 8.172096 155.390803 58.119286 3.557544 5.034248 25.387775
# Drop bui (the predictor with the highest VIF) and refit
model_no_bui <- lm(boxcox_fires ~ dmc + dc + fwi + hfi + ffmc + isi, data = weekly_summary)
summary(model_no_bui)
##
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + fwi + hfi + ffmc + isi,
## data = weekly_summary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3174 -1.6061 -0.0923 1.5856 6.5768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.620e+00 1.560e+00 4.884 1.83e-06 ***
## dmc -8.390e-03 1.032e-02 -0.813 0.416854
## dc 5.504e-03 1.441e-03 3.819 0.000168 ***
## fwi 1.507e-01 7.915e-02 1.904 0.058067 .
## hfi 2.418e-04 6.444e-05 3.753 0.000216 ***
## ffmc -4.922e-02 2.110e-02 -2.333 0.020410 *
## isi -3.101e-01 1.649e-01 -1.880 0.061247 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.212 on 256 degrees of freedom
## Multiple R-squared: 0.4318, Adjusted R-squared: 0.4185
## F-statistic: 32.43 on 6 and 256 DF, p-value: < 2.2e-16
# Re-check multicollinearity after removing bui
vif(model_no_bui)
## dmc dc fwi hfi ffmc isi
## 8.793841 3.157935 49.437948 3.500272 3.793957 22.012746
# Drop fwi (still a high VIF) and refit
model_no_fwi <- lm(boxcox_fires ~ dmc + dc + hfi + ffmc + isi, data = weekly_summary)
summary(model_no_fwi)
##
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + hfi + ffmc + isi, data = weekly_summary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6039 -1.6627 -0.0623 1.5154 5.6961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.433e+00 1.438e+00 4.475 1.15e-05 ***
## dmc 6.331e-03 6.865e-03 0.922 0.357
## dc 7.086e-03 1.183e-03 5.988 7.11e-09 ***
## hfi 2.790e-04 6.173e-05 4.519 9.47e-06 ***
## ffmc -4.130e-02 2.079e-02 -1.987 0.048 *
## isi -3.511e-02 8.004e-02 -0.439 0.661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.223 on 257 degrees of freedom
## Multiple R-squared: 0.4238, Adjusted R-squared: 0.4126
## F-statistic: 37.8 on 5 and 257 DF, p-value: < 2.2e-16
# Re-check multicollinearity after removing fwi
vif(model_no_fwi)
## dmc dc hfi ffmc isi
## 3.853991 2.107379 3.179478 3.646212 5.131314
# Keep only the predictors that remained significant: dc, hfi, ffmc
model_final <- lm(boxcox_fires ~ dc + hfi + ffmc, data = weekly_summary)
summary(model_final)
##
## Call:
## lm(formula = boxcox_fires ~ dc + hfi + ffmc, data = weekly_summary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8259 -1.6849 -0.0558 1.5553 5.5430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.171e+00 1.079e+00 5.720 2.93e-08 ***
## dc 7.872e-03 8.840e-04 8.905 < 2e-16 ***
## hfi 2.819e-04 4.518e-05 6.241 1.76e-09 ***
## ffmc -4.025e-02 1.328e-02 -3.030 0.00269 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.219 on 259 degrees of freedom
## Multiple R-squared: 0.4215, Adjusted R-squared: 0.4148
## F-statistic: 62.9 on 3 and 259 DF, p-value: < 2.2e-16
# Final multicollinearity check (all VIFs should now be low)
vif(model_final)
## dc hfi ffmc
## 1.180492 1.709183 1.494406
# Hypothetical scenarios: moderate, high and extreme fire-weather conditions
test_1 <- data.frame(
dc = c(200, 400, 600),
hfi = c(1000, 7000, 12000),
ffmc = c(50, 80, 95)
)
# Predict the Box-Cox transformed number of fires
predicted_boxcox_fires_test_1 <- predict(model_final, newdata = test_1)
# Invert the shifted Box-Cox transform z = ((x + 1)^lambda - 1) / lambda,
# recovering x on the original (fire-count) scale. Vectorized over `y`;
# `lambda` is a scalar. lambda == 0 corresponds to the log transform.
reverse_boxcox <- function(y, lambda) {
  if (lambda == 0) {
    return(exp(y) - 1)
  }
  (lambda * y + 1)^(1 / lambda) - 1
}
# Back-transform predictions to the original fire-count scale
predicted_fires_test_1 <- reverse_boxcox(predicted_boxcox_fires_test_1, lm_best)
# Combine the scenarios with their predicted fire counts
result_test_1 <- cbind(test_1, predicted_fires_test_1)
print(result_test_1)
## dc hfi ffmc predicted_fires_test_1
## 1 200 1000 50 290.7634
## 2 400 7000 80 1767.9751
## 3 600 12000 95 13139.9558
# Scenario grid where only HFI varies (DC and FFMC held constant)
test_2 <- data.frame(
dc = rep(300, 5), # Constant value for Drought Code
hfi = seq(2000, 10000, by = 2000),# Varying values for Head Fire Intensity
ffmc = rep(70, 5) # Constant value for Fine Fuel Moisture Code
)
# Predict the Box-Cox transformed number of fires
predicted_boxcox_fires_test_2 <- predict(model_final, newdata = test_2)
# Transform to original scale
predicted_fires_test_2 <- reverse_boxcox(predicted_boxcox_fires_test_2, lm_best)
# Combine the scenarios with their predicted fire counts
result_test_2 <- cbind(test_2, predicted_fires_test_2)
print(result_test_2)
## dc hfi ffmc predicted_fires_test_2
## 1 300 2000 70 368.0153
## 2 300 4000 70 606.1323
## 3 300 6000 70 992.9600
## 4 300 8000 70 1618.3602
## 5 300 10000 70 2624.7191
# Cluster IDs for the Kelowna-area fires (kelowna_clusters built earlier)
kelowna_cluster_ids <- kelowna_clusters$event_cluster
# Summary table for Kelowna: per-cluster counts, mean model predictors,
# and first/last detection timestamps
kelowna_summary <- hotspots_test %>%
filter(event_cluster %in% kelowna_cluster_ids) %>%
group_by(event_cluster, year, month) %>%
summarize(
count = n(),
mean_dc = mean(dc, na.rm = TRUE),
mean_hfi = mean(hfi, na.rm = TRUE),
mean_ffmc = mean(ffmc, na.rm = TRUE),
start_date = min(rep_date),
end_date = max(rep_date),
.groups = 'drop'
)
print(kelowna_summary)
## # A tibble: 7 × 9
## event_cluster year month count mean_dc mean_hfi mean_ffmc start_date
## <int> <dbl> <ord> <int> <dbl> <dbl> <dbl> <dttm>
## 1 11964 2023 Aug 982 866. 3415. 91.7 2023-08-25 12:44:00
## 2 11969 2023 Aug 2153 943. 7212. 93.9 2023-08-16 16:54:00
## 3 13047 2023 Aug 446 889. 3199. 91.8 2023-08-20 16:28:00
## 4 13124 2023 Aug 252 886. 3788. 92.1 2023-08-21 16:09:00
## 5 13384 2023 Aug 15 894. 1240 87.4 2023-08-23 01:51:00
## 6 13402 2023 Aug 4 1017. 146. 83.0 2023-08-23 03:19:00
## 7 13694 2023 Aug 8 943. 249. 83.5 2023-08-25 01:36:00
## # ℹ 1 more variable: end_date <dttm>
# Scenario table built from the observed Kelowna per-cluster means
test_Kelowna <- data.frame(
dc = kelowna_summary$mean_dc,
hfi = kelowna_summary$mean_hfi,
ffmc = kelowna_summary$mean_ffmc
)
# Predict with the final 3-predictor model, then back-transform
predicted_boxcox_fires_test_Kelowna <- predict(model_final, newdata = test_Kelowna)
predicted_fires_test_Kelowna <- reverse_boxcox(predicted_boxcox_fires_test_Kelowna, lm_best)
# Combine inputs and predictions
result_test_Kelowna <- cbind(test_Kelowna, predicted_fires_test_Kelowna)
result_test_Kelowna
## dc hfi ffmc predicted_fires_test_Kelowna
## 1 865.9385 3414.656 91.68305 11194.53
## 2 942.7162 7212.464 93.90707 40926.27
## 3 889.2258 3198.713 91.80471 12338.04
## 4 886.0876 3788.135 92.06285 13748.79
## 5 894.1928 1240.000 87.39640 9340.41
## 6 1017.0992 146.250 82.97425 18643.13
## 7 943.0106 249.375 83.53512 11592.75
mean(result_test_Kelowna$predicted_fires_test_Kelowna, na.rm = TRUE)
## [1] 16826.28
# Extended Kelowna summary: adds mean DMC, ISI, BUI and FWI so the
# clusters can also be fed to the full model (model_1).
# NOTE(review): this overwrites the earlier `kelowna_summary` — consider a
# distinct name.
kelowna_summary <- hotspots_test %>%
filter(event_cluster %in% kelowna_cluster_ids) %>%
group_by(event_cluster, year, month) %>%
summarize(
count = n(),
mean_dc = mean(dc, na.rm = TRUE),
mean_hfi = mean(hfi, na.rm = TRUE),
mean_ffmc = mean(ffmc, na.rm = TRUE),
mean_dmc = mean(dmc, na.rm = TRUE),
mean_isi = mean(isi, na.rm = TRUE),
mean_bui = mean(bui, na.rm = TRUE),
mean_fwi = mean(fwi, na.rm = TRUE),
start_date = min(rep_date),
end_date = max(rep_date),
.groups = 'drop'
)
print(kelowna_summary)
print(kelowna_summary)
## # A tibble: 7 × 13
## event_cluster year month count mean_dc mean_hfi mean_ffmc mean_dmc mean_isi
## <int> <dbl> <ord> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11964 2023 Aug 982 866. 3415. 91.7 98.8 8.18
## 2 11969 2023 Aug 2153 943. 7212. 93.9 179. 16.0
## 3 13047 2023 Aug 446 889. 3199. 91.8 166. 7.87
## 4 13124 2023 Aug 252 886. 3788. 92.1 167. 7.94
## 5 13384 2023 Aug 15 894. 1240 87.4 166. 5.21
## 6 13402 2023 Aug 4 1017. 146. 83.0 187. 2.69
## 7 13694 2023 Aug 8 943. 249. 83.5 112. 3.05
## # ℹ 4 more variables: mean_bui <dbl>, mean_fwi <dbl>, start_date <dttm>,
## # end_date <dttm>
# Scenario table with all seven predictors required by model_1
test_Kelowna_2 <- data.frame(
dc = kelowna_summary$mean_dc,
hfi = kelowna_summary$mean_hfi,
ffmc = kelowna_summary$mean_ffmc,
dmc = kelowna_summary$mean_dmc,
isi = kelowna_summary$mean_isi,
bui = kelowna_summary$mean_bui,
fwi = kelowna_summary$mean_fwi
)
# Predict with the full model and back-transform to fire counts
predicted_boxcox_fires_test_Kelowna_2 <- predict(model_1, newdata = test_Kelowna_2)
predicted_fires_test_Kelowna_2 <- reverse_boxcox(predicted_boxcox_fires_test_Kelowna_2, lm_best)
# Combine inputs and predictions
result_test_Kelowna_2 <- cbind(test_Kelowna_2, predicted_fires_test_Kelowna_2)
result_test_Kelowna_2
## dc hfi ffmc dmc isi bui fwi
## 1 865.9385 3414.656 91.68305 98.82007 8.181949 153.6726 32.15215
## 2 942.7162 7212.464 93.90707 179.38759 16.005370 243.0274 52.63774
## 3 889.2258 3198.713 91.80471 166.09449 7.865226 226.4096 33.58339
## 4 886.0876 3788.135 92.06285 166.91505 7.942167 226.9296 33.81845
## 5 894.1928 1240.000 87.39640 165.54847 5.211667 226.2989 25.38913
## 6 1017.0992 146.250 82.97425 187.42925 2.691250 256.6035 15.87200
## 7 943.0106 249.375 83.53512 112.02188 3.051125 172.6675 16.55725
## predicted_fires_test_Kelowna_2
## 1 9104.7784
## 2 9910.6061
## 3 2773.5525
## 4 3092.0990
## 5 1011.5985
## 6 264.3268
## 7 1409.2541
mean(result_test_Kelowna_2$predicted_fires_test_Kelowna_2, na.rm = TRUE)
## [1] 3938.031